🆕 新增/修改的程式碼
通用摘要器:可直接餵 Day 8 逐字稿或任何長文,會輸出精煉摘要+章節重點。
// src/utils/summarize.js
import { openai } from "../aiClient.js";
import { PromptBuilder } from "../promptBuilder.js";
/**
 * 將長文(如 Day 8 逐字稿)整理成適合口語朗讀的摘要與節點大綱。
 * @param {string} text - 原始長文內容。
 * @param {{tone?: string, length?: "short"|"medium"|"long"}} [options] - 語氣與長度提示。
 * @returns {Promise<{summary: string, outline: string[]}>} 摘要稿與大綱列表。
 * @throws {Error} 當模型回傳內容非合法 JSON,或缺少 summary / outline 欄位時。
 */
export async function summarizeForAudio(text, { tone = "friendly", length = "medium" } = {}) {
  // 依 length 轉成字數提示;非 short/long 一律視為 medium
  const lengthHint =
    length === "short" ? "約 120~180 字" :
    length === "long" ? "約 600~900 字" : "約 250~400 字";

  const pb = new PromptBuilder()
    .setRole("你是專業的中文內容編輯與配音腳本撰寫者")
    .setGoal("將長文整理成適合口語朗讀的稿件,並提供 4~8 條節點大綱")
    .addConstraint("語句自然,避免過長句;易於 TTS 朗讀")
    .addConstraint("避免虛構資訊;保留關鍵數據與結論")
    .addConstraint("輸出包含:『摘要稿』與『大綱列表』兩部分,以 JSON 格式")
    .setFormatHint(`長度提示:${lengthHint},語氣:${tone}`)
    .setUserInput(text);

  const res = await openai.chat.completions.create({
    model: "gpt-4o-mini",
    temperature: 0.4,
    messages: [
      { role: "system", content: pb.buildSystemPrompt() },
      // 用單引號字串,避免 JSON 範例中的雙引號需要跳脫
      { role: "user", content: '請以純 JSON 回覆:{"summary":"...","outline":["...", ...]}' }
    ]
  });

  const raw = res.choices?.[0]?.message?.content?.trim() || "{}";
  // 容錯解析:模型可能把 JSON 包在 ``` 或 ```json 程式碼圍欄中
  const json = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1] ?? raw;
  const obj = JSON.parse(json);
  if (!obj.summary || !Array.isArray(obj.outline)) {
    throw new Error("摘要結果格式不正確");
  }
  return obj;
}
兩條管線皆會把中繼內容與最終音檔落在 outputs/mm/ 之下。
// src/day10_multimodal.js
import fs from "fs";
import path from "path";
import { imageToJson } from "./day7_image_to_text.js";
import { speak } from "./day9_text_to_speech.js";
import { transcribe } from "./day8_speech_to_text.js";
import { summarizeForAudio } from "./utils/summarize.js";
/**
 * 確保目錄存在;若不存在則連同上層目錄一併建立。
 * @param {string} dir - 目標目錄路徑。
 */
function ensureDir(dir) {
  if (fs.existsSync(dir)) return;
  fs.mkdirSync(dir, { recursive: true });
}
/**
 * 管線一:圖片 → JSON 描述 → 口播稿 → 語音檔。
 * 中繼 JSON 與最終音檔皆落在 outputs/mm/ 下。
 * @param {{imagePath?: string|null, imageUrl?: string|null, wantOCR?: boolean,
 *          length?: string, voice?: string, format?: string}} [options]
 * @returns {Promise<{jsonPath: string, audioPath: string}>} 描述 JSON 與語音檔路徑。
 */
export async function photoNarration({ imagePath = null, imageUrl = null, wantOCR = false, length = "medium", voice = "alloy", format = "mp3" } = {}) {
  // 依文件說明,兩條管線的輸出都放在 outputs/mm/ — TODO confirm 與原始版本一致
  const outDir = path.join("outputs", "mm");
  ensureDir(outDir);

  // 1) 圖片 → JSON 描述
  const desc = await imageToJson({ imagePath, imageUrl, wantOCR, length });
  const jsonPath = path.join(outDir, `desc_${Date.now()}.json`);
  fs.writeFileSync(jsonPath, JSON.stringify(desc, null, 2), "utf-8");

  // 2) JSON → 口播稿(把 title + description 合併成易唸文本)
  const script = `${desc.title}。${desc.description}`;

  // 3) 口播稿 → TTS
  const { filepath } = await speak({
    text: script,
    voice,
    format,
    outputDir: outDir,
    filename: `narration_${Date.now()}`
  });
  return { jsonPath, audioPath: filepath };
}
/**
 * 管線二:會議音檔 → 逐字稿 → 口語化摘要 → Podcast 語音。
 * 中繼 JSON 與最終音檔皆落在 outputs/mm/ 下。
 * @param {{filePath?: string|null, url?: string|null, lang?: string, prompt?: string,
 *          length?: string, voice?: string, format?: string}} [options]
 * @returns {Promise<{transcriptPath: string, summaryPath: string, audioPath: string}>}
 */
export async function meetingToPodcast({ filePath = null, url = null, lang = "zh", prompt = "", length = "medium", voice = "aria", format = "mp3" } = {}) {
  // 依文件說明,兩條管線的輸出都放在 outputs/mm/ — TODO confirm 與原始版本一致
  const outDir = path.join("outputs", "mm");
  ensureDir(outDir);

  // 1) STT:音檔 → 逐字稿
  const { text, saved } = await transcribe({ filePath, url, language: lang, prompt, detailed: false });

  // 2) 長文 → 口語化摘要(適合朗讀)
  const { summary, outline } = await summarizeForAudio(text, { tone: "friendly", length });
  const summaryObj = { outline, summary };
  // 去掉副檔名;原寫法 /.[^.]+$/ 的 . 未跳脫,會連同檔名其他字元一起誤刪
  const baseName = path.basename(saved.txt).replace(/\.[^.]+$/, "");
  const summaryPath = path.join(outDir, `${baseName}_summary.json`);
  fs.writeFileSync(summaryPath, JSON.stringify(summaryObj, null, 2), "utf-8");

  // 3) 摘要 → TTS(Podcast 口吻)
  const script = `這是今天會議的口語化摘要。${summary}`;
  const { filepath } = await speak({
    text: script,
    voice,
    format,
    outputDir: outDir,
    filename: `${baseName}_podcast`
  });
  return { transcriptPath: saved.txt, summaryPath, audioPath: filepath };
}
// 極簡 CLI 參數解析:--key value → { key: "value" };--flag(後面不是值)→ { flag: true }
const args = {};
{
  const argv = process.argv.slice(2);
  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    if (!token.startsWith("--")) continue;
    const key = token.slice(2);
    const next = argv[i + 1];
    // 下一個 token 存在、非空字串且不是另一個旗標時才當作值
    const hasValue = Boolean(next) && !next.startsWith("--");
    args[key] = hasValue ? next : true;
  }
}
/**
 * CLI 進入點:依 --task 分派到各功能管線(mm / tts / stt / vision / image / chat / …)。
 * NOTE(review): speakFromFile、resetSession、chatOnce、englishTeacher、codeReview、
 * sentimentClassify、newsToJson、textToImage 未出現在本檔可見的 import 區 —
 * 請確認檔案頂部的匯入是否完整。
 */
async function main() {
  const task = args.task || "chat";
  if (task === "mm") {
    const mode = args.mode || "photo_narration";
    if (mode === "photo_narration") {
      const out = await photoNarration({
        imagePath: args.imagePath || null,
        imageUrl: args.imageUrl || null,
        wantOCR: args.ocr === "true" || args.ocr === true,
        length: args.length || "medium",
        voice: args.voice || "alloy",
        format: args.format || "mp3",
      });
      console.log("\n=== Photo Narration ===");
      console.log("描述 JSON:", out.jsonPath);
      console.log("語音檔:", out.audioPath);
    } else if (mode === "transcript_podcast") {
      const out = await meetingToPodcast({
        filePath: args.filePath || null,
        url: args.url || null,
        lang: args.lang || "zh",
        prompt: args.prompt || "",
        length: args.length || "medium",
        voice: args.voice || "aria",
        format: args.format || "mp3",
      });
      console.log("\n=== Meeting → Podcast ===");
      console.log("逐字稿 TXT:", out.transcriptPath);
      console.log("摘要 JSON:", out.summaryPath);
      console.log("Podcast 語音:", out.audioPath);
    } else {
      console.log("未知模式,請使用 --mode photo_narration | transcript_podcast");
    }
  } else if (task === "tts") {
    const text = args.text || "";
    const file = args.file || "";
    const model = args.model || process.env.OPENAI_TTS_MODEL || "gpt-4o-mini-tts";
    const voice = args.voice || "alloy";
    const format = args.format || "mp3";
    const speed = args.speed ? Number(args.speed) : 1.0;
    const filename = args.out || undefined;
    if (file) {
      const { filepath, bytes } = await speakFromFile(file, { model, voice, format, speed, filename });
      console.log("\n=== 文字檔 → 語音 ===");
      console.log("輸出:", filepath, `(${bytes} bytes)`);
    } else {
      const content = text || "這是一段測試用的語音。";
      const { filepath, bytes } = await speak({ text: content, model, voice, format, speed, filename });
      console.log("\n=== 文字 → 語音 ===");
      console.log("輸出:", filepath, `(${bytes} bytes)`);
    }
  } else if (task === "stt") {
    const filePath = args.filePath || null;
    const url = args.url || null;
    const language = args.lang || "";
    const prompt = args.prompt || "";
    const detailed = args.detailed === "true" || args.detailed === true;
    const { text, saved } = await transcribe({ filePath, url, language, prompt, detailed });
    console.log("\n=== 語音轉文字(STT) ===\n");
    console.log(text);
    console.log("\n已儲存:", saved);
  } else if (task === "vision") {
    const imagePath = args.imagePath || null;
    const imageUrl = args.imageUrl || null;
    const wantOCR = args.ocr === "true" || args.ocr === true;
    const length = args.length || "medium";
    const out = await imageToJson({ imagePath, imageUrl, wantOCR, length });
    console.log("\n=== 圖片 → JSON 描述 ===\n");
    console.log(JSON.stringify(out, null, 2));
  } else if (task === "image") {
    const prompt = args.text || "一隻戴著太空頭盔的柴犬,漂浮在月球上,插著台灣國旗";
    const size = args.size || "512x512";
    const n = args.n ? Number(args.n) : 1;
    const urls = await textToImage(prompt, { size, n });
    console.log("\n=== 生成圖片 ===\n");
    urls.forEach((f) => console.log("已儲存:" + f));
  } else if (task === "chat") {
    const sessionId = args.session || "default";
    if (args.reset) {
      resetSession(sessionId);
      console.log(`已重設會話:${sessionId}`);
      return;
    }
    const input = args.text || "嗨,我想規劃 3 天 2 夜的台中旅遊行程。";
    const { reply } = await chatOnce(input, { sessionId });
    console.log(`\n[${sessionId}] AI:\n${reply}\n`);
  } else if (task === "teacher") {
    const out = await englishTeacher(args.text || "He go to school every day.");
    console.log("\n=== 英文老師 ===\n");
    console.log(out);
  } else if (task === "review") {
    const out = await codeReview("function sum(arr){ return arr.reduce((a,b)=>a+b,0) }");
    console.log("\n=== 程式碼審查 ===\n");
    console.log(out);
  } else if (task === "sentiment") {
    const out = await sentimentClassify(args.text || "今天心情糟透了,事情一團亂。");
    console.log("\n=== 情緒分類(JSON) ===\n");
    console.log(out);
  } else if (task === "json_summary") {
    const out = await newsToJson(args.text || "OpenAI 發布新模型,效能大幅提升。");
    console.log("\n=== 新聞 JSON 摘要 ===\n");
    console.log(out);
  } else {
    console.log("未知任務,請使用 --task mm | tts | stt | vision | image | chat | teacher | review | sentiment | json_summary");
  }
}
// 啟動 CLI:任何未捕捉的錯誤都印出訊息並以非零代碼結束
main().catch((err) => {
  console.error("發生錯誤:", err.message);
  process.exit(1);
});
保留你原有的 scripts,只新增以下:
{
"scripts": {
"day10:photo": "node index.js --task mm --mode photo_narration --imageUrl https://images.unsplash.com/photo-1519681393784-d120267933ba --voice alloy --format mp3",
    "day10:podcast": "node index.js --task mm --mode transcript_podcast --url https://example.com/demo.m4a --lang zh --prompt \"專案:SmartGo Plus;術語:RAG、LIFF、IIS\" --voice aria --format mp3"
}
}
▶️ 如何執行(CLI)
npm run day10:photo --silent
npm run day10:podcast --silent